Student: Iam Mars

CALIFORNIA SCHOOL IMMUNIZATIONS DATA (California Dept. of Public Health)

First, let us load the necessary libraries and the actual dataset.

library(readr)
library(tidyr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(plotly)
## Warning: package 'plotly' was built under R version 3.5.3
## Loading required package: ggplot2
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
# Load the immunization records; column types are guessed by readr.
immune <- readr::read_csv(file = "kindergarten_CA.csv")
## Parsed with column specification:
## cols(
##   district = col_character(),
##   sch_code = col_double(),
##   county = col_character(),
##   pub_priv = col_character(),
##   school = col_character(),
##   enrollment = col_double(),
##   complete = col_double(),
##   start_year = col_double()
## )
# Inspect the structure (column names, types, first values) of the raw data.
utils::str(object = immune)
## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 110382 obs. of  8 variables:
##  $ district  : chr  "Alameda Unified" "Alameda Unified" "Alameda Unified" "Alameda Unified" ...
##  $ sch_code  : num  6967434 6110779 6100374 6090013 6090039 ...
##  $ county    : chr  "Alameda" "Alameda" "Alameda" "Alameda" ...
##  $ pub_priv  : chr  "Private" "Public" "Public" "Public" ...
##  $ school    : chr  "ALAMEDA CHRTN" "BAY FARM ELEM" "EARHART (AMELIA) ELEM" "EDISON ELEM" ...
##  $ enrollment: num  12 78 77 56 41 75 40 80 61 49 ...
##  $ complete  : num  11 77 73 53 41 65 34 76 61 43 ...
##  $ start_year: num  2001 2001 2001 2001 2001 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   district = col_character(),
##   ..   sch_code = col_double(),
##   ..   county = col_character(),
##   ..   pub_priv = col_character(),
##   ..   school = col_character(),
##   ..   enrollment = col_double(),
##   ..   complete = col_double(),
##   ..   start_year = col_double()
##   .. )

This data set contains 8 variables and (originally) 110,382 observations. The variables are: * district * sch_code * county * pub_priv * school * enrollment * complete * start_year

Summary Statistics

# Drop every row that contains a missing value, then add the percentage of
# enrolled students who are completely immunized.
immune <- immune %>%
  na.omit() %>%
  mutate(ratio_complete = (complete / enrollment) * 100)

# Summary statistics of the immunization percentages.
completesum <- summary(immune$ratio_complete)
completesum
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00   88.06   95.00   90.33   98.46  100.00
# Histogram of the immunization percentages across all years; the main title
# is intentionally left blank.
hist1 <- hist(
  immune$ratio_complete,
  breaks = 10,
  col = 3,
  main = "",
  xlab = "Percentages of completely immunized students"
)

Now, let us begin to clean the data.

First, let us create a percentage for the number of completely immunized students out of each school's total enrollment. Next, let us find the top 5 counties by enrollment and then filter for them. Draft code (not evaluated): `thousand_one <- filter(immune, start_year %in% c("2001", "2002", "2003", "2004", "2005", "2006", "2007", "2008", "2009", "2010", "2011", "2012", "2013", "2014", "2015"))`

# Rank counties by total enrollment and keep only the five largest.
# (Previously this held every county and was never used.)
top5 <- immune %>%
  group_by(county) %>%
  summarize(enrolled = sum(enrollment, na.rm = TRUE)) %>%
  arrange(desc(enrolled)) %>%
  head(5) %>%
  select(county)

# Restrict the data to those five counties, driven by `top5` instead of a
# hard-coded list of names. `ratio_complete` already exists on `immune`
# (computed earlier in the script), so it is selected rather than recomputed.
immune1 <- immune %>%
  select(county, start_year, ratio_complete, enrollment, complete) %>%
  filter(county %in% top5$county)

Next, let us begin to graph.

library(ggplot2)
library(RColorBrewer)
# Heat map of immunization percentage by year (x) and county (y).
# The tile border is a fixed colour, so it is set outside aes(); placing the
# constant inside aes() treated "grey50" as a data value, which produced a
# default-coloured outline and a spurious legend entry.
# NOTE(review): `immune` has one row per school, so several tiles are drawn on
# top of each other for the same county/year cell -- consider aggregating to
# one value per county and year before plotting.
p1 <- ggplot(immune, aes(x = start_year, y = county, fill = ratio_complete)) +
  geom_tile(color = "grey50") +
  xlab("Year") +
  scale_fill_gradient(low = "lightblue", high = "darkred") +
  ggtitle("Percentages of completely immunized students per county (California)")
ggplotly(p1)
## Warning in matrix(g$fill_plotlyDomain, nrow = length(y), ncol =
## length(x), : data length [108732] is not a sub-multiple or multiple of the
## number of rows [58]
## Warning in matrix(g$hovertext, nrow = length(y), ncol = length(x), byrow =
## TRUE): data length [108732] is not a sub-multiple or multiple of the number
## of rows [58]
# Heat map of immunization percentage by year for the five counties kept in
# `immune1`. The title previously said "top 4" although five counties are
# plotted.
p2 <- ggplot(immune1, aes(x = start_year, y = county)) +
  geom_tile(aes(fill = ratio_complete)) +
  xlab("Year") +
  scale_fill_gradient(low = "lightblue", high = "darkred") +
  ggtitle("Percentages of completely immunized students by top 5 counties (California)")
ggplotly(p2)
## Warning in matrix(g$fill_plotlyDomain, nrow = length(y), ncol =
## length(x), : data length [55259] is not a sub-multiple or multiple of the
## number of rows [5]
## Warning in matrix(g$hovertext, nrow = length(y), ncol = length(x), byrow =
## TRUE): data length [55259] is not a sub-multiple or multiple of the number
## of rows [5]
# Heat map of immunization percentage by year, split by school type
# (the `pub_priv` column).
p3 <- ggplot(data = immune, mapping = aes(x = start_year, y = pub_priv)) +
  geom_tile(mapping = aes(fill = ratio_complete)) +
  scale_fill_gradient(low = "lightblue", high = "darkred") +
  labs(
    x = "Year",
    title = "Percentages of completely immunized students: Public vs Private (California)"
  )
ggplotly(p3)
## Warning in matrix(g$fill_plotlyDomain, nrow = length(y), ncol =
## length(x), : data length [108730] is not a sub-multiple or multiple of the
## number of columns [15]
## Warning in matrix(g$hovertext, nrow = length(y), ncol = length(x), byrow =
## TRUE): data length [108730] is not a sub-multiple or multiple of the number
## of columns [15]
# Overall mean immunization percentage, returned as a named value.
colMeans(immune["ratio_complete"])
## ratio_complete 
##       90.33241
# Distribution of immunization percentages for the 2001 school year.
# `start_year` is numeric, so compare against a number rather than relying on
# implicit coercion to the string "2001".
thousand_one <- filter(immune, start_year == 2001)
hist2 <- hist(thousand_one$ratio_complete, col = 3,
     xlab = "Percentages of completely immunized students", breaks = 10)

# Distribution of immunization percentages for the 2011 school year.
# `start_year` is numeric, so compare against a number rather than relying on
# implicit coercion to the string "2011".
thousand_eleven <- filter(immune, start_year == 2011)
# Stored as `hist3` so the 2001 histogram object in `hist2` is not overwritten.
hist3 <- hist(thousand_eleven$ratio_complete, col = 3,
     xlab = "Percentages of completely immunized students", breaks = 10)

Summary:

This data comes from the state of California.